In [None]:
import pytz
import datetime
import marimo as mo

# Capture "now" in the Asia/Kolkata timezone so the header shows when the
# notebook was last executed.
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)
curr = now.strftime("%Y-%m-%d, %I:%M:%S %p %Z")

# Title block: fixed submission deadline plus the timestamp computed above.
_header_markdown = rf"""
# Week - 2

**Submission Date:** `2025-10-05, 23:59 IST`

**Last Updated:** `{curr}`
"""
mo.md(_header_markdown)

### Import required packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

In [None]:
# Read the assignment data; the path is relative to the working directory.
dataset_path = "Week-2/Graded Assignment/dataset.csv"
df = pd.read_csv(dataset_path)

### Basic Dataset Inspection

In [None]:
# Display the dimensions of the raw dataset as loaded into `df`.
df.shape

In [None]:
# Preview the first five rows of the raw dataset.
df.head(n=5)

In [None]:
# Print the summary produced by DataFrame.info() for the raw dataset.
df.info()

In [None]:
# Display the descriptive statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Question 1

Which of the following columns have object datatype?

In [None]:
# Keep only the entries whose dtype compares equal to "object".
column_dtypes = df.dtypes
column_dtypes[column_dtypes == "object"]

### Question 2

In this dataset, how many **"Males"** from **"Europe"** have made **"InGamePurchases"**?

In [None]:
# Count the rows that satisfy all three conditions at once.
is_male = df["Gender"] == "Male"
in_europe = df["Location"] == "Europe"
has_purchased = df["InGamePurchases"] == 1
len(df[is_male & in_europe & has_purchased])

### Question 3

In your dataset, how many players under the **"Age"** 18 have strictly greater than 10 **"PlayTimeHours"**?

In [None]:
# Players younger than 18 with strictly more than 10 play-time hours.
is_under_18 = df["Age"] < 18
plays_over_10_hours = df["PlayTimeHours"] > 10
len(df[is_under_18 & plays_over_10_hours])

### Question 4

Create feature matrix(X) and label vector(y) using following instructions:

**"EngagementLevel"** is the target column(y).

All the columns except the target column are in feature matrix(X).

How many total null values were present in the whole dataset?

In [None]:
# Separate the label vector from the feature matrix.
target_column = "EngagementLevel"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Total missing values: per-column counts first, then their grand total.
nulls_per_column = df.isna().sum()
nulls_per_column.sum()

### Question 5

Split the dataset into train dataset and test dataset in the following manner.

Use sklearn train_test_split function to split the data.

Use only 20% data as test_set and keep random_state = 42

Which category has the least value counts in y_train?

In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed so the split
# is reproducible.
test_fraction = 0.2
split_seed = 42
_splits = train_test_split(
    X, y, test_size=test_fraction, random_state=split_seed
)
X_train, X_test, y_train, y_test = _splits

In [None]:
# Label with the fewest occurrences in the training target.
train_class_counts = y_train.value_counts()
train_class_counts.idxmin()

## Common Instructions for Question 6 and 7

Rules for Imputing the missing(NaN) or Unknown values:

- Calculating statistical values (such as mean, median, mode) for each column in the training dataset.
- Applying these calculated statistical values to replace missing (NaN) and unknown values in both the training and test datasets.
- Ensure that the calculation of statistical values excludes any rows containing missing or unknown values.
- Replace Unknown values in the **"Age"** feature with the Mean value in that.
- Replace Unknown values in the **"Location"** feature with the constant value **"Other"**.
- Replace Unknown values in the **"GameDifficulty"** feature with the Most Frequent value in that.
- Replace Unknown values in the **"InGamePurchases"** feature with the constant value '0'.
- Write the answers related to the above imputation in below questions respectively.

In [None]:
# Missing "Age" values in (train, test) before imputation.
tuple(split["Age"].isna().sum() for split in (X_train, X_test))

In [None]:
# Impute "Age" with the mean learned from the TRAINING split only.
# Series.mean() skips NaN by default, so missing rows do not affect the
# statistic. The same training mean is applied to the test split: computing
# a separate test mean would leak test-set information and contradicts the
# imputation rules stated above.
train_mean = X_train["Age"].mean()

X_train["Age"] = X_train["Age"].fillna(train_mean)
X_test["Age"] = X_test["Age"].fillna(train_mean)

In [None]:
# Missing "Age" values in (train, test) after imputation.
tuple(split["Age"].isna().sum() for split in (X_train, X_test))

In [None]:
# Missing "Location" values in (train, test) before imputation.
tuple(split["Location"].isna().sum() for split in (X_train, X_test))

In [None]:
# Fill missing "Location" entries with the constant "Other" in both splits.
for _features in (X_train, X_test):
    _features["Location"] = _features["Location"].fillna("Other")

In [None]:
# Missing "Location" values in (train, test) after imputation.
tuple(split["Location"].isna().sum() for split in (X_train, X_test))

In [None]:
# Missing "GameDifficulty" values in (train, test) before imputation.
tuple(split["GameDifficulty"].isna().sum() for split in (X_train, X_test))

In [None]:
# Impute "GameDifficulty" with the most frequent value of the TRAINING split.
# The statistic is learned once from the training data and reused for the
# test split, as the imputation rules require; the test split must not
# supply its own mode.
difficulty_train_mode = X_train["GameDifficulty"].mode()[0]

X_train["GameDifficulty"] = X_train["GameDifficulty"].fillna(
    difficulty_train_mode
)
X_test["GameDifficulty"] = X_test["GameDifficulty"].fillna(
    difficulty_train_mode
)

In [None]:
# Missing "GameDifficulty" values in (train, test) after imputation.
tuple(split["GameDifficulty"].isna().sum() for split in (X_train, X_test))

In [None]:
# Missing "InGamePurchases" values in (train, test) before imputation.
tuple(split["InGamePurchases"].isna().sum() for split in (X_train, X_test))

In [None]:
# Fill missing "InGamePurchases" entries with the constant 0 in both splits.
for _features in (X_train, X_test):
    _features["InGamePurchases"] = _features["InGamePurchases"].fillna(0)

In [None]:
# Missing "InGamePurchases" values in (train, test) after imputation.
tuple(split["InGamePurchases"].isna().sum() for split in (X_train, X_test))

### Question 6

Write the sum of transformed(imputed) "Age" column of the test dataset. (upto 2 digits after decimal points)

In [None]:
# Sum of the imputed test "Age" column, rounded to two decimals.
test_age_total = X_test["Age"].sum()
round(test_age_total, 2)

### Question 7

Apply preprocessing on features of train and test datasets.

- Drop the **"PlayerID"** Column before the preprocessing steps.
- Before applying any preprocessing there should not be any missing or unknown values present in the train and test dataset.
- Learn transformers' parameters using training set only and then transform train & test sets using them.

    - **For Categorical Features**

        - **Ordinal Features**

            Ordinally Encode **"GameDifficulty"**

            | **GameDifficulty** | **Order** |
            | ------------------ | --------- |
            | Easy               | 0         |
            | Medium             | 1         |
            | Hard               | 2         |

        - **Nominal Features**

            - One-Hot Encode **'Gender'**, **'Location'**, **'GameGenre'** features and keep `drop_first = True`.

        - **Scaling Features**

            - Scale all the features (transformed categorical and numerical) of the feature matrix using the StandardScaler

Calculate the sum of all the values present in first five rows of transformed test feature matrix ? (upto 2 digits afer the decimal)

In [None]:
# Remove the identifier column before preprocessing.
# Rebinding to the frame returned by drop() avoids in-place mutation of the
# frames produced by train_test_split, and errors="ignore" keeps the cell
# safe to re-run once the column is already gone.
X_train = X_train.drop(columns=["PlayerID"], errors="ignore")
X_test = X_test.drop(columns=["PlayerID"], errors="ignore")

# Guard against the column surviving in either split.
assert "PlayerID" not in X_train.columns
assert "PlayerID" not in X_test.columns

In [None]:
# Preprocessing must start from fully imputed data: no NaN in either split.
for _features in (X_train, X_test):
    assert _features.isna().sum().sum() == 0

## Difference between

```python
X_train[["GameDifficulty"]]
```

and

```python
X_train["GameDifficulty"]
```

in **pandas**:

---

### 1. `X_train["GameDifficulty"]`

* Returns a **Series**.
* Shape: `(n,)` where `n` is the number of rows.
* One-dimensional, index-aligned.
* Example:

  ```python
  type(X_train["GameDifficulty"])
  # pandas.Series
  X_train["GameDifficulty"].shape
  # (1000,)   # if 1000 rows
  ```

---

### 2. `X_train[["GameDifficulty"]]`

* Returns a **DataFrame**.
* Shape: `(n, 1)`.
* Two-dimensional, column remains in DataFrame format.
* Example:

  ```python
  type(X_train[["GameDifficulty"]])
  # pandas.DataFrame
  X_train[["GameDifficulty"]].shape
  # (1000, 1)
  ```

---

### ðŸ”‘ Why does this matter?

* **Series (`[]` single bracket)** â†’ when you just need the raw column vector.
* **DataFrame (`[[]]` double bracket)** â†’ when you want to keep it as a DataFrame, e.g., for scikit-learn models which expect 2D arrays as input (`.fit(X, y)`).

---

âœ… Example usage:

```python
# This may break if X needs to be 2D
model.fit(X_train["GameDifficulty"], y_train)

# This works (keeps it 2D)
model.fit(X_train[["GameDifficulty"]], y_train)
```

In [None]:
# Explicit category order so that Easy -> 0, Medium -> 1, Hard -> 2.
difficulty_order = ["Easy", "Medium", "Hard"]
ordinal_encoder = OrdinalEncoder(categories=[difficulty_order])
ordinal_encoder

In [None]:
# Learn the ordinal encoding on the training split only, then apply the
# already-fitted encoder to the test split. The test split uses transform(),
# not fit_transform(), so no parameters are ever learned from test data.
# The encoder returns an (n, 1) array; ravel() flattens it to a 1-D column.
X_train["GameDifficulty"] = ordinal_encoder.fit_transform(
    X_train[["GameDifficulty"]]
).ravel()

X_test["GameDifficulty"] = ordinal_encoder.transform(
    X_test[["GameDifficulty"]]
).ravel()

In [None]:
# One-Hot Encode the nominal categorical features with drop_first=True
# One-hot encode the nominal features. Categories are learned from the
# training split only and reused unchanged for the test split.
_nominal_cols = ["Gender", "Location", "GameGenre"]

_ohe_encoder = OneHotEncoder(
    drop="first",  # drop the first category of each feature (drop_first=True)
    sparse_output=False,  # return a dense array
    handle_unknown="ignore",  # do not raise on categories unseen during fit
)

# Fit the encoder on the training data
_ohe_encoder.fit(X_train[_nominal_cols])

# Transform both train and test sets with the fitted encoder
_X_train_ohe = _ohe_encoder.transform(X_train[_nominal_cols])
_X_test_ohe = _ohe_encoder.transform(X_test[_nominal_cols])

# Wrap the encoded arrays in DataFrames that share the original row index so
# they line up with the remaining features when concatenated.
_ohe_columns = _ohe_encoder.get_feature_names_out(_nominal_cols)
_X_train_ohe_df = pd.DataFrame(
    _X_train_ohe, columns=_ohe_columns, index=X_train.index
)
_X_test_ohe_df = pd.DataFrame(
    _X_test_ohe, columns=_ohe_columns, index=X_test.index
)

# Replace the nominal columns with their encoded counterparts.
# The names are rebound rather than assigned through `X_train[:] = ...`:
# slice assignment only writes into columns that already exist, so the new
# one-hot columns would never be attached to the feature matrices.
X_train = pd.concat(
    [X_train.drop(columns=_nominal_cols), _X_train_ohe_df], axis=1
)
X_test = pd.concat(
    [X_test.drop(columns=_nominal_cols), _X_test_ohe_df], axis=1
)

# Both splits must expose the same columns, including every encoded one.
assert set(_ohe_columns) <= set(X_train.columns)
assert list(X_train.columns) == list(X_test.columns)

# Optional: verify that there are no remaining categorical columns
X_train.dtypes

In [None]:
# Initialize StandardScaler
# Standardise every feature: scaling parameters are learned from the training
# split only, then applied to both splits.
_scaler = StandardScaler().fit(X_train)

X_train_scaled_array = _scaler.transform(X_train)
X_test_scaled_array = _scaler.transform(X_test)

# Re-wrap the scaled arrays so column names and row indices are preserved.
X_train_scaled = pd.DataFrame(
    X_train_scaled_array, index=X_train.index, columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    X_test_scaled_array, index=X_test.index, columns=X_test.columns
)

# Optional check: training features should have mean ~0 and std ~1.
_train_scaled_summary = X_train_scaled.describe()
_train_scaled_summary.loc[["mean", "std"]]

In [None]:
# Question 7 asks about the fully transformed (encoded AND scaled) test
# matrix, so the sum is taken over X_test_scaled rather than the unscaled
# X_test.
first_five_scaled = X_test_scaled.iloc[:5]
round(first_five_scaled.to_numpy().sum(), 2)